/*-------------------------------------------------------------------------
 *
 * fd.c
 *      Virtual file descriptor code.
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * This source code file contains modifications made by THL A29 Limited ("Tencent Modifications").
 * All Tencent Modifications are Copyright (C) 2023 THL A29 Limited.
 *
 * IDENTIFICATION
 *      src/backend/storage/file/fd.c
 *
 * NOTES:
 *
 * This code manages a cache of 'virtual' file descriptors (VFDs).
 * The server opens many file descriptors for a variety of reasons,
 * including base tables, scratch files (e.g., sort and hash spool
 * files), and random calls to C library routines like system(3); it
 * is quite easy to exceed system limits on the number of open files a
 * single process can have.  (This is around 256 on many modern
 * operating systems, but can be as low as 32 on others.)
 *
 * VFDs are managed as an LRU pool, with actual OS file descriptors
 * being opened and closed as needed.  Obviously, if a file is
 * opened using these interfaces, all subsequent operations must also
 * be through these interfaces (the File type is not a real file
 * descriptor).
 *
 * For this scheme to work, most (if not all) routines throughout the
 * server should use these interfaces instead of calling the C library
 * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
 * may find ourselves short of real file descriptors anyway.
 *
 * INTERFACE ROUTINES
 *
 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
 * A File opened with OpenTemporaryFile is automatically deleted when the
 * File is closed, either explicitly or implicitly at end of transaction or
 * process exit. PathNameOpenFile is intended for files that are held open
 * for a long time, like relation files. It is the caller's responsibility
 * to close them, there is no automatic mechanism in fd.c for that.
 *
 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
 * They behave like the corresponding native functions, except that the handle
 * is registered with the current subtransaction, and will be automatically
 * closed at abort. These are intended mainly for short operations like
 * reading a configuration file; there is a limit on the number of files that
 * can be opened using these functions at any one time.
 *
 * Finally, BasicOpenFile is just a thin wrapper around open() that can
 * release file descriptors in use by the virtual file descriptors if
 * necessary. There is no automatic cleanup of file descriptors returned by
 * BasicOpenFile, it is solely the caller's responsibility to close the file
 * descriptor by calling close(2).
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <sys/file.h>
#include <sys/param.h>
#include <sys/stat.h>
#ifndef WIN32
#include <sys/mman.h>
#endif
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#ifdef HAVE_SYS_RESOURCE_H
#include <sys/resource.h>        /* for getrlimit */
#endif

#include "miscadmin.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/pg_tablespace.h"
#include "pgstat.h"
#include "portability/mem.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "utils/guc.h"
#ifdef PGXC
#include "pgxc/pgxc.h"
#endif
#ifdef _SHARDING_
#include <linux/falloc.h>
#include <errno.h>
#endif

#include "utils/resowner_private.h"


/*
 * Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data.
 *
 * The three conditions tested here are the same ones that guard the three
 * alternative implementations inside pg_flush_data(): sync_file_range(),
 * mmap() + msync(MS_ASYNC), and posix_fadvise(POSIX_FADV_DONTNEED).  Keep
 * the two places in sync.
 */
#if defined(HAVE_SYNC_FILE_RANGE)
#define PG_FLUSH_DATA_WORKS 1
#elif !defined(WIN32) && defined(MS_ASYNC)
#define PG_FLUSH_DATA_WORKS 1
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
#define PG_FLUSH_DATA_WORKS 1
#endif

/*
 * We must leave some file descriptors free for system(), the dynamic loader,
 * and other code that tries to open files without consulting fd.c.  This
 * is the number left free.  (While we can be pretty sure we won't get
 * EMFILE, there's never any guarantee that we won't get ENFILE due to
 * other processes chewing up FDs.  So it's a bad idea to try to open files
 * without consulting fd.c.  Nonetheless we cannot control all code.)
 *
 * Because this is just a fixed setting, we are effectively assuming that
 * no such code will leave FDs open over the long term; otherwise the slop
 * is likely to be insufficient.  Note in particular that we expect that
 * loading a shared library does not result in any permanent increase in
 * the number of open files.  (This appears to be true on most if not
 * all platforms as of Feb 2004.)
 */
#define NUM_RESERVED_FDS        10

/*
 * If we have fewer than this many usable FDs after allowing for the reserved
 * ones, choke.
 */
#define FD_MINFREE                10


/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 */
int            max_files_per_process = 1000;

/*
 * Maximum number of file descriptors to open for either VFD entries or
 * AllocateFile/AllocateDir/OpenTransientFile operations.  This is initialized
 * to a conservative value, and remains that way indefinitely in bootstrap or
 * standalone-backend cases.  In normal postmaster operation, the postmaster
 * calls set_max_safe_fds() late in initialization to update the value, and
 * that value is then inherited by forked subprocesses.
 *
 * Note: the value of max_files_per_process is taken into account while
 * setting this variable, and so need not be tested separately.
 */
int            max_safe_fds = 32;    /* default if not changed */


/* Debugging.... */

/*
 * DO_DB(A): with FDDEBUG defined, execute statement A (typically an elog
 * call) while saving and restoring errno around it, so that the debug output
 * cannot disturb the errno seen by the surrounding code.  Without FDDEBUG
 * the macro expands to a no-op and A is not evaluated at all.
 */
#ifdef FDDEBUG
#define DO_DB(A) \
    do { \
        int            _do_db_save_errno = errno; \
        A; \
        errno = _do_db_save_errno; \
    } while (0)
#else
#define DO_DB(A) \
    ((void) 0)
#endif

/* Value stored in a Vfd's fd field while no kernel descriptor is assigned */
#define VFD_CLOSED (-1)

/*
 * A File is valid if it is a nonzero index below SizeVfdCache whose slot has
 * a non-NULL fileName.  Index 0 is rejected because VfdCache[0] is only a
 * list header.  Note that the argument is evaluated more than once.
 */
#define FileIsValid(file) \
    ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/* True if the slot has no kernel descriptor; performs no bounds checking */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/*
 * Note: a VFD's seekPos is normally always valid, but if for some reason
 * an lseek() fails, it might become set to FileUnknownPos.  We can struggle
 * along without knowing the seek position in many cases, but in some places
 * we have to fail if we don't have it.
 */
#define FileUnknownPos ((off_t) -1)
#define FilePosIsUnknown(pos) ((pos) < 0)

/* these are the assigned bits in fdstate below: */
#define FD_TEMPORARY        (1 << 0)    /* T = delete when closed */
#define FD_XACT_TEMPORARY    (1 << 1)    /* T = delete at eoXact */

/*
 * One entry of the VfdCache array.  A slot is in use ("virtually open") when
 * fileName is non-NULL; it additionally holds a kernel descriptor whenever
 * fd is not VFD_CLOSED.  fileFlags and fileMode are kept so that the file
 * can be (re)opened with open(2) later on.
 */
typedef struct vfd
{
    int            fd;                /* current FD, or VFD_CLOSED if none */
    unsigned short fdstate;        /* bitflags for VFD's state */
    ResourceOwner resowner;        /* owner, for automatic cleanup */
    File        nextFree;        /* link to next free VFD, if in freelist */
    File        lruMoreRecently;    /* doubly linked recency-of-use list */
    File        lruLessRecently;
    off_t        seekPos;        /* current logical file position, or -1 */
    off_t        fileSize;        /* current size of file (0 if not temporary) */
    char       *fileName;        /* name of file, or NULL for unused VFD */
    /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
    int            fileFlags;        /* open(2) flags for (re)opening the file */
    int            fileMode;        /* mode to pass to open(2) */
} Vfd;

/*
 * Virtual File Descriptor array pointer and size.  This grows as
 * needed.  'File' values are indexes into this array.
 * Note that VfdCache[0] is not a usable VFD, just a list header.
 */
static Vfd *VfdCache;
static Size SizeVfdCache = 0;

/*
 * Number of file descriptors known to be in use by VFD entries.
 */
static int    nfile = 0;

/*
 * Flag to tell whether it's worth scanning VfdCache looking for temp files
 * to close
 */
static bool have_xact_temporary_files = false;

/*
 * Tracks the total size of all temporary files.  Note: when temp_file_limit
 * is being enforced, this cannot overflow since the limit cannot be more
 * than INT_MAX kilobytes.  When not enforcing, it could theoretically
 * overflow, but we don't care.
 */
static uint64 temporary_files_size = 0;

/*
 * List of OS handles opened with AllocateFile, AllocateDir and
 * OpenTransientFile.
 */
/*
 * Discriminator for AllocateDesc: records which kind of OS handle an entry
 * holds.  NOTE(review): judging by the names, these correspond to
 * AllocateFile, OpenPipeStream, AllocateDir and OpenTransientFile
 * respectively; the code that assigns them is not in this part of the file,
 * so confirm there.
 */
typedef enum
{
    AllocateDescFile,
    AllocateDescPipe,
    AllocateDescDir,
    AllocateDescRawFD
} AllocateDescKind;

/*
 * One tracked OS handle.  "kind" says what sort of handle this is and hence
 * which member of the "desc" union is meaningful (presumably file for the
 * File and Pipe kinds, dir for Dir, fd for RawFD -- confirm against
 * FreeDesc).  create_subid is presumably the subtransaction in which the
 * handle was opened, used for cleanup at subtransaction abort -- TODO
 * confirm against the code that fills it in.
 */
typedef struct
{
    AllocateDescKind kind;
    SubTransactionId create_subid;
    union
    {
        FILE       *file;
        DIR           *dir;
        int            fd;
    }            desc;
} AllocateDesc;

/*
 * Bookkeeping for the AllocateDesc entries.  NOTE(review): presumably
 * allocatedDescs is a growable array, numAllocatedDescs the number of entries
 * in use and maxAllocatedDescs its current capacity; the code maintaining
 * them (e.g. reserveAllocatedDesc) is outside this part of the file.
 */
static int    numAllocatedDescs = 0;
static int    maxAllocatedDescs = 0;
static AllocateDesc *allocatedDescs = NULL;

/*
 * Number of temporary files opened during the current session;
 * this is used in generation of tempfile names.
 */
static long tempFileCounter = 0;

/*
 * Array of OIDs of temp tablespaces.  When numTempTableSpaces is -1,
 * this has not been set in the current transaction.
 */
static Oid *tempTableSpaces = NULL;
static int    numTempTableSpaces = -1;
/* NOTE(review): presumably an index into tempTableSpaces selecting the next one to use -- confirm */
static int    nextTempTableSpace = 0;


/*--------------------
 *
 * Private Routines
 *
 * Delete           - delete a file from the Lru ring
 * LruDelete       - remove a file from the Lru ring and close its FD
 * Insert           - put a file at the front of the Lru ring
 * LruInsert       - put a file at the front of the Lru ring and open it
 * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
 * AllocateVfd       - grab a free (or new) file record (from VfdCache)
 * FreeVfd           - free a file record
 *
 * The Least Recently Used ring is a doubly linked list that begins and
 * ends on element zero.  Element zero is special -- it doesn't represent
 * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
 * anchor that shows us the beginning/end of the ring.
 * Only VFD elements that are currently really open (have an FD assigned) are
 * in the Lru ring.  Elements that are "virtually" open can be recognized
 * by having a non-null fileName field.
 *
 * example:
 *
 *       /--less----\                   /---------\
 *       v           \              v              \
 *     #0 --more---> LeastRecentlyUsed --more-\ \
 *      ^\                                    | |
 *       \\less--> MostRecentlyUsedFile    <---/ |
 *        \more---/                     \--less--/
 *
 *--------------------
 */
/* LRU ring and VFD slot management; each is summarized in the comment above */
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int    LruInsert(File file);
static bool ReleaseLruFile(void);
static void ReleaseLruFiles(void);
static File AllocateVfd(void);
static void FreeVfd(File file);

/* Other helpers private to this file */
static int    FileAccess(File file);
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
static bool reserveAllocatedDesc(void);
static int    FreeDesc(AllocateDesc *desc);
static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel);

/* Cleanup; AtProcExit_Files is the on_proc_exit hook installed by InitFileAccess */
static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isProcExit);
static void RemovePgTempFilesInDir(const char *tmpdirname);
static void RemovePgTempRelationFiles(const char *tsdirname);
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
static bool looks_like_temp_rel_name(const char *name);

/* walkdir takes a per-entry callback; the two functions after it match that callback signature */
static void walkdir(const char *path,
        void (*action) (const char *fname, bool isdir, int elevel),
        bool process_symlinks,
        int elevel);
#ifdef PG_FLUSH_DATA_WORKS
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);

/* fsync helpers used by fsync_fname() and the durable_* routines */
static int    fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
static int    fsync_parent_path(const char *fname, int elevel);


/*
 * pg_fsync --- flush a file descriptor using the configured sync flavor
 *
 * Dispatches to pg_fsync_writethrough() when sync_method selects
 * fsync_writethrough (only on builds where that differs from plain fsync),
 * and to pg_fsync_no_writethrough() in every other case.  The result of the
 * chosen routine is returned unchanged.
 */
int
pg_fsync(int fd)
{
    /* the sync_method test is compiled out where it could never matter */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
    if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
        return pg_fsync_writethrough(fd);
#endif

    return pg_fsync_no_writethrough(fd);
}


/*
 * pg_fsync_no_writethrough --- fsync(2), unless enableFsync is off
 *
 * Returns fsync()'s result, or 0 without touching the descriptor when
 * fsyncs are disabled.
 */
int
pg_fsync_no_writethrough(int fd)
{
    if (!enableFsync)
        return 0;

    return fsync(fd);
}

/*
 * pg_fsync_writethrough --- write-through flush, unless enableFsync is off
 *
 * Uses _commit() on WIN32 and fcntl(F_FULLFSYNC) where that is defined.  On
 * platforms offering neither, the call fails with errno set to ENOSYS.
 *
 * Returns 0 on success (or when fsyncs are disabled), -1 on failure.
 */
int
pg_fsync_writethrough(int fd)
{
    if (!enableFsync)
        return 0;

#ifdef WIN32
    return _commit(fd);
#elif defined(F_FULLFSYNC)
    if (fcntl(fd, F_FULLFSYNC, 0) == -1)
        return -1;
    return 0;
#else
    errno = ENOSYS;
    return -1;
#endif
}

/*
 * pg_fdatasync --- fdatasync(2), unless enableFsync is off
 *
 * Platforms lacking fdatasync() get a plain fsync() instead.  Returns the
 * system call's result, or 0 when fsyncs are disabled.
 */
int
pg_fdatasync(int fd)
{
    if (!enableFsync)
        return 0;

#ifdef HAVE_FDATASYNC
    return fdatasync(fd);
#else
    return fsync(fd);
#endif
}

/*
 * pg_flush_data --- advise OS that the described dirty data should be flushed
 *
 * fd is a kernel file descriptor; offset and nbytes describe the byte range
 * for which writeback should be started.  An offset of 0 with nbytes 0 means
 * that the entire file should be flushed; in this case, this function may
 * have side-effects on the file's seek position!  (The msync() variant uses
 * lseek(SEEK_END) to learn the file length.)
 *
 * This is purely a hint: nothing is returned, and a failure of the flush
 * request itself is only reported as a WARNING.  The one hard failure is a
 * failing munmap(), which is FATAL because the mapping would otherwise
 * remain.  Does nothing at all when enableFsync is off.
 */
void
pg_flush_data(int fd, off_t offset, off_t nbytes)
{// #lizard forgives
    /*
     * Right now file flushing is primarily used so that later
     * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
     * if fsyncs are disabled - that's a decision we might want to make
     * configurable at some point.
     */
    if (!enableFsync)
        return;

    /*
     * We compile all alternatives that are supported on the current platform,
     * to find portability problems more easily.
     */
#if defined(HAVE_SYNC_FILE_RANGE)
    {
        int            rc;

        /*
         * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
         * tells the OS that writeback for the specified blocks should be
         * started, but that we don't want to wait for completion.  Note that
         * this call might block if too much dirty data exists in the range.
         * This is the preferable method on OSs supporting it, as it works
         * reliably when available (contrast to msync()) and doesn't flush out
         * clean data (like FADV_DONTNEED).
         */
        rc = sync_file_range(fd, offset, nbytes,
                             SYNC_FILE_RANGE_WRITE);

        /* don't error out, this is just a performance optimization */
        if (rc != 0)
        {
            ereport(WARNING,
                    (errcode_for_file_access(),
                     errmsg("could not flush dirty data: %m")));
        }

        return;
    }
#endif
#if !defined(WIN32) && defined(MS_ASYNC)
    {
        void       *p;
        static int    pagesize = 0;

        /*
         * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
         * writeback. On linux it only does so if MS_SYNC is specified, but
         * then it does the writeback synchronously. Luckily all common linux
         * systems have sync_file_range().  This is preferable over
         * FADV_DONTNEED because it doesn't flush out clean data.
         *
         * We map the file (mmap()), tell the kernel to sync back the contents
         * (msync()), and then remove the mapping again (munmap()).
         */

        /* mmap() needs actual length if we want to map whole file */
        if (offset == 0 && nbytes == 0)
        {
            nbytes = lseek(fd, 0, SEEK_END);
            if (nbytes < 0)
            {
                ereport(WARNING,
                        (errcode_for_file_access(),
                         errmsg("could not determine dirty data size: %m")));
                return;
            }
        }

        /*
         * Some platforms reject partial-page mmap() attempts.  To deal with
         * that, just truncate the request to a page boundary.  If any extra
         * bytes don't get flushed, well, it's only a hint anyway.
         */

        /* fetch pagesize only once */
        if (pagesize == 0)
            pagesize = sysconf(_SC_PAGESIZE);

        /* align length to pagesize, dropping any fractional page */
        if (pagesize > 0)
            nbytes = (nbytes / pagesize) * pagesize;

        /* fractional-page request is a no-op */
        if (nbytes <= 0)
            return;

        /*
         * mmap could well fail, particularly on 32-bit platforms where there
         * may simply not be enough address space.  If so, silently fall
         * through to the next implementation.  A length that does not fit
         * into SSIZE_MAX is treated the same way without even trying.
         */
        if (nbytes <= (off_t) SSIZE_MAX)
            p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
        else
            p = MAP_FAILED;

        if (p != MAP_FAILED)
        {
            int            rc;

            rc = msync(p, (size_t) nbytes, MS_ASYNC);
            if (rc != 0)
            {
                ereport(WARNING,
                        (errcode_for_file_access(),
                         errmsg("could not flush dirty data: %m")));
                /* NB: need to fall through to munmap()! */
            }

            rc = munmap(p, (size_t) nbytes);
            if (rc != 0)
            {
                /* FATAL error because mapping would remain */
                ereport(FATAL,
                        (errcode_for_file_access(),
                         errmsg("could not munmap() while flushing data: %m")));
            }

            return;
        }
    }
#endif
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
    {
        int            rc;

        /*
         * Signal the kernel that the passed in range should not be cached
         * anymore. This has the, desired, side effect of writing out dirty
         * data, and the, undesired, side effect of likely discarding useful
         * clean cached blocks.  For the latter reason this is the least
         * preferable method.
         */

        rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);

        if (rc != 0)
        {
            /*
             * posix_fadvise() hands back the error number as its result and
             * is not specified to set errno, so install it there for the
             * benefit of errcode_for_file_access() and %m below.
             */
            errno = rc;

            /* don't error out, this is just a performance optimization */
            ereport(WARNING,
                    (errcode_for_file_access(),
                     errmsg("could not flush dirty data: %m")));
        }

        return;
    }
#endif
}


/*
 * fsync_fname -- fsync a file or directory, handling errors properly
 *
 * Try to fsync a file or directory. When doing the latter, ignore errors that
 * indicate the OS just doesn't allow/require fsyncing directories.
 */
void
fsync_fname(const char *fname, bool isdir)
{
    fsync_fname_ext(fname, isdir, false, ERROR);
}

/*
 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
 *
 * After this returns successfully, the rename survives a crash.  A crash
 * while it is running leaves either the pre-existing target or the moved
 * file under the new name; no mixed state or truncated file is possible.
 *
 * The sequence is: fsync the source, fsync the target if one already exists,
 * rename, then fsync the file under its new name and its containing
 * directory.
 *
 * rename() cannot be used across arbitrary directories, as they might not be
 * on the same filesystem, so renaming across directories is not supported.
 *
 * Problems are reported with the severity given in elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  errno is not valid
 * upon return.
 */
int
durable_rename(const char *oldfile, const char *newfile, int elevel)
{// #lizard forgives
    int            target_fd;

    /* The source must be safely on disk before it can replace anything. */
    if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
        return -1;

    /*
     * Flush a pre-existing target as well.  That is not strictly necessary,
     * but it makes crashes easier to reason about: either the source or the
     * target file is then guaranteed to exist afterwards.  A missing target
     * (ENOENT) is perfectly fine.
     */
    target_fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDWR, 0);
    if (target_fd >= 0)
    {
        if (pg_fsync(target_fd) != 0)
        {
            /* close file upon error, might not be in transaction context */
            int            fsync_errno = errno;

            CloseTransientFile(target_fd);
            errno = fsync_errno;

            ereport(elevel,
                    (errcode_for_file_access(),
                     errmsg("could not fsync file \"%s\": %m", newfile)));
            return -1;
        }
        CloseTransientFile(target_fd);
    }
    else if (errno != ENOENT)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", newfile)));
        return -1;
    }

    /* Now perform the rename itself. */
    if (rename(oldfile, newfile) < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not rename file \"%s\" to \"%s\": %m",
                        oldfile, newfile)));
        return -1;
    }

    /*
     * Make the rename persistent: flush the file under its new name first,
     * then the directory that contains it.
     */
    if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
        fsync_parent_path(newfile, elevel) != 0)
        return -1;

    return 0;
}

/*
 * durable_unlink -- remove a file in a durable manner
 *
 * After this returns successfully, the removal survives a crash.  That is
 * achieved by fsyncing the file's parent directory once the unlink itself
 * has been done.
 *
 * Problems are reported with the severity given in elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  errno is not valid
 * upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
    if (unlink(fname) < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not remove file \"%s\": %m",
                        fname)));
        return -1;
    }

    /* The directory entry is gone; make that fact persistent too. */
    return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}

/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Like durable_rename(), but tries (without guaranteeing it) not to
 * overwrite an existing target: where HAVE_WORKING_LINK is set the new name
 * is created with link() and the old one removed afterwards, otherwise a
 * plain rename() is used.
 *
 * A crash at an unfortunate moment can leave two links to the target file.
 *
 * Problems are reported with the severity given in elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  errno is not valid
 * upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
    /*
     * Flush the source first, so that a crash right after the link/rename
     * still leaves a file with valid contents under the new name.
     */
    if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
        return -1;

#if HAVE_WORKING_LINK
    if (link(oldfile, newfile) < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not link file \"%s\" to \"%s\": %m",
                        oldfile, newfile)));
        return -1;
    }
    /* the result of removing the old name is not checked */
    unlink(oldfile);
#else
    /* XXX: Add racy file existence check? */
    if (rename(oldfile, newfile) < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not rename file \"%s\" to \"%s\": %m",
                        oldfile, newfile)));
        return -1;
    }
#endif

    /*
     * To survive an OS crash, both the new entry and then its parent
     * directory have to be flushed.
     */
    if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
        fsync_parent_path(newfile, elevel) != 0)
        return -1;

    return 0;
}

/*
 * InitFileAccess --- initialize this module during backend startup
 *
 * This is called during either normal or standalone backend start.
 * It is *not* called in the postmaster.
 *
 * Creates the VFD cache with just its header entry (index 0, which has no
 * kernel descriptor) and registers AtProcExit_Files as a proc-exit callback.
 * Running out of memory here is FATAL.
 */
void
InitFileAccess(void)
{
    /* must not be called more than once */
    Assert(SizeVfdCache == 0);

    /* the cache starts out holding only the list header */
    VfdCache = (Vfd *) malloc(sizeof(Vfd));
    if (VfdCache == NULL)
        ereport(FATAL,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory")));

    MemSet((char *) VfdCache, 0, sizeof(Vfd));
    VfdCache[0].fd = VFD_CLOSED;
    SizeVfdCache = 1;

    /* make sure temp files get dropped when the process exits */
    on_proc_exit(AtProcExit_Files, 0);
}
#ifdef _MLS_
/*
 * MlsInitFileAccess --- temporary VFD cache setup
 *
 * The VFD cache is needed for the relcrypt hash, so it is initialized
 * temporarily here.  This performs the same cache setup as InitFileAccess()
 * but does not register a proc-exit callback.  Running out of memory is
 * FATAL.
 */
void
MlsInitFileAccess(void)
{
    /* must not be called while a cache already exists */
    Assert(SizeVfdCache == 0);

    /* the cache starts out holding only the list header */
    VfdCache = (Vfd *) malloc(sizeof(Vfd));
    if (VfdCache == NULL)
        ereport(FATAL,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory")));

    MemSet((char *) VfdCache, 0, sizeof(Vfd));
    VfdCache[0].fd = VFD_CLOSED;
    SizeVfdCache = 1;
}

/*
 * MlsCleanFileAccess --- release the VFD cache set up by MlsInitFileAccess
 *
 * Frees the cache array and resets SizeVfdCache to 0, which is the state
 * InitFileAccess() and MlsInitFileAccess() assert on entry.  The pointer is
 * cleared as well, so that no dangling reference to the freed array is left
 * behind and calling this routine a second time is harmless.
 */
void
MlsCleanFileAccess(void)
{
    /* free(NULL) is a no-op, so no guard is required */
    free(VfdCache);
    VfdCache = NULL;

    SizeVfdCache = 0;
}
#endif


/*
 * count_usable_fds --- count how many FDs the system will let us open,
 *        and estimate how many are already open.
 *
 * Descriptors are obtained by dup()'ing FD 0 (stdin is assumed to be
 * available for that) until dup() fails, the soft RLIMIT_NOFILE limit is
 * about to be reached, or max_to_probe descriptors have been obtained.  All
 * of them are closed again before returning.
 *
 * Outputs:
 *    *usable_fds        number of successful dup() calls
 *    *already_open    highest FD seen + 1, minus *usable_fds
 *
 * Note: a small value of max_to_probe might result in an underestimate of
 * already_open; any "gaps" in the set of used FDs have to be filled before
 * the calculation gives the right answer.  In practice a max_to_probe of a
 * couple of dozen should be enough to ensure good results.
 */
static void
count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
{// #lizard forgives
    int            capacity = 1024;
    int           *probed;
    int            nprobed = 0;
    int            maxfd = 0;
    int            i;

#ifdef HAVE_GETRLIMIT
    struct rlimit rlim;
    int            getrlimit_status;
#endif

    probed = (int *) palloc(capacity * sizeof(int));

#ifdef HAVE_GETRLIMIT
#ifdef RLIMIT_NOFILE            /* most platforms use RLIMIT_NOFILE */
    getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
#else                            /* but BSD doesn't ... */
    getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
#endif                            /* RLIMIT_NOFILE */
    if (getrlimit_status != 0)
        ereport(WARNING, (errmsg("getrlimit failed: %m")));
#endif                            /* HAVE_GETRLIMIT */

    /* keep dup'ing until something fails or enough have been probed */
    do
    {
        int            newfd;

#ifdef HAVE_GETRLIMIT

        /*
         * Stop short of RLIMIT_NOFILE; running into it causes irritating
         * kernel logs on some platforms.
         */
        if (getrlimit_status == 0 && maxfd >= rlim.rlim_cur - 1)
            break;
#endif

        newfd = dup(0);
        if (newfd < 0)
        {
            /* anything other than EMFILE or ENFILE is fishy */
            if (errno != EMFILE && errno != ENFILE)
                elog(WARNING, "dup(0) failed after %d successes: %m", nprobed);
            break;
        }

        /* remember the descriptor, doubling the array when it is full */
        if (nprobed >= capacity)
        {
            capacity *= 2;
            probed = (int *) repalloc(probed, capacity * sizeof(int));
        }
        probed[nprobed++] = newfd;

        if (newfd > maxfd)
            maxfd = newfd;
    } while (nprobed < max_to_probe);

    /* give back everything that was opened for the probe */
    for (i = 0; i < nprobed; i++)
        close(probed[i]);

    pfree(probed);

    /*
     * The system limit is assumed to be maxfd + 1 (remember 0 is a legal FD
     * number), so whatever part of that we did not open ourselves must have
     * been open already.
     */
    *usable_fds = nprobed;
    *already_open = maxfd + 1 - nprobed;
}

/*
 * set_max_safe_fds
 *        Determine number of filedescriptors that fd.c is allowed to use
 *
 * Sets max_safe_fds to
 *        MIN(usable_fds, max_files_per_process - already_open)
 * minus NUM_RESERVED_FDS, the slop for files that are opened without
 * consulting fd.c.  That keeps us below both max_files_per_process and the
 * experimentally-determined EMFILE limit.  If fewer than FD_MINFREE
 * descriptors remain, a FATAL error is raised.
 */
void
set_max_safe_fds(void)
{
    int            probed_fds;
    int            inuse_fds;

    count_usable_fds(max_files_per_process, &probed_fds, &inuse_fds);

    /* smaller of the two limits, less the FDs reserved for system() etc. */
    max_safe_fds = Min(probed_fds, max_files_per_process - inuse_fds)
        - NUM_RESERVED_FDS;

    /* refuse to run if that leaves too little to get by */
    if (max_safe_fds < FD_MINFREE)
        ereport(FATAL,
                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                 errmsg("insufficient file descriptors available to start server process"),
                 errdetail("System allows %d, we need at least %d.",
                           max_safe_fds + NUM_RESERVED_FDS,
                           FD_MINFREE + NUM_RESERVED_FDS)));

    elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
         max_safe_fds, probed_fds, inuse_fds);
}

/*
 * BasicOpenFile --- same as open(2) except can free other FDs if needed
 *
 * When open() fails with EMFILE or ENFILE, the condition is logged, one
 * descriptor is released via ReleaseLruFile() and the open is retried; this
 * repeats for as long as ReleaseLruFile() manages to release something.
 *
 * Returns the kernel FD, or -1 on failure with errno holding the error from
 * the last open() attempt.
 *
 * This is exported for use by places that really want a plain kernel FD,
 * but need to be proof against running out of FDs.  Once an FD has been
 * successfully returned, it is the caller's responsibility to ensure that
 * it will not be leaked on ereport()!  Most users should *not* call this
 * routine directly, but instead use the VFD abstraction level, which
 * provides protection against descriptor leaks as well as management of
 * files that need to be open for more than a short period of time.
 *
 * Ideally this should be the *only* direct call of open() in the backend.
 * In practice, the postmaster calls open() directly, and there are some
 * direct open() calls done early in backend startup.  Those are OK since
 * this module wouldn't have any open files to close at that point anyway.
 */
int
BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
{
    for (;;)
    {
        int            open_errno;
        int            fd = open(fileName, fileFlags, fileMode);

        if (fd >= 0)
            return fd;            /* success! */

        /* only descriptor exhaustion is worth a retry */
        if (errno != EMFILE && errno != ENFILE)
            return -1;

        open_errno = errno;

        ereport(LOG,
                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                 errmsg("out of file descriptors: %m; release and retry")));
        errno = 0;
        if (!ReleaseLruFile())
        {
            /* nothing left to release; report the original failure */
            errno = open_errno;
            return -1;
        }
    }
}

#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the LRU ring from most to least recently
 * used, following the lruLessRecently links until we are back at slot 0.
 */
static void
_dump_lru(void)
{
    char        line[2048];
    size_t        used;
    int            idx = VfdCache[0].lruLessRecently;

    snprintf(line, sizeof(line), "LRU: MOST %d ", idx);
    while (idx != 0)
    {
        /* step to the next-older entry and append its index */
        idx = VfdCache[idx].lruLessRecently;
        used = strlen(line);
        snprintf(line + used, sizeof(line) - used, "%d ", idx);
    }
    used = strlen(line);
    snprintf(line + used, sizeof(line) - used, "LEAST");
    elog(LOG, "%s", line);
}
#endif                            /* FDDEBUG */

/*
 * Delete --- unlink a VFD from the LRU ring.
 *
 * Only the neighbours' links are rewritten; the entry's own link fields and
 * its kernel descriptor are left untouched.
 */
static void
Delete(File file)
{
    Vfd           *entry;
    File        older;
    File        newer;

    Assert(file != 0);

    DO_DB(elog(LOG, "Delete %d (%s)",
               file, VfdCache[file].fileName));
    DO_DB(_dump_lru());

    entry = &VfdCache[file];
    older = entry->lruLessRecently;
    newer = entry->lruMoreRecently;

    /* splice the two neighbours together, bypassing this entry */
    VfdCache[older].lruMoreRecently = newer;
    VfdCache[newer].lruLessRecently = older;

    DO_DB(_dump_lru());
}

/*
 * LruDelete --- give up the kernel descriptor of a VFD and take the VFD out
 * of the LRU ring.  The VFD itself stays allocated so that it can be
 * re-opened later.
 */
static void
LruDelete(File file)
{
    Vfd           *entry;

    Assert(file != 0);

    DO_DB(elog(LOG, "LruDelete %d (%s)",
               file, VfdCache[file].fileName));

    entry = &VfdCache[file];

    /*
     * The seek position is normally tracked.  If we lost it, ask the kernel
     * one more time.  Failure here means the position cannot be restored on
     * re-open, but we only log it: refusing to close the file would break
     * callers such as transaction cleanup.
     */
    if (FilePosIsUnknown(entry->seekPos))
    {
        entry->seekPos = lseek(entry->fd, (off_t) 0, SEEK_CUR);
        if (FilePosIsUnknown(entry->seekPos))
            elog(LOG, "could not seek file \"%s\" before closing: %m",
                 entry->fileName);
    }

    /*
     * A close() failure is unexpected; we prefer leaking the descriptor over
     * leaving our bookkeeping inconsistent, so just log and carry on.
     */
    if (close(entry->fd) != 0)
        elog(LOG, "could not close file \"%s\": %m", entry->fileName);
    entry->fd = VFD_CLOSED;
    --nfile;

    /* finally drop the entry from the LRU ring */
    Delete(file);
}

/*
 * Insert --- link a VFD into the LRU ring at the most-recently-used end,
 * i.e. directly "less recent" than the ring header in slot 0.
 */
static void
Insert(File file)
{
    Vfd           *entry;
    File        prevHead;

    Assert(file != 0);

    DO_DB(elog(LOG, "Insert %d (%s)",
               file, VfdCache[file].fileName));
    DO_DB(_dump_lru());

    entry = &VfdCache[file];
    prevHead = VfdCache[0].lruLessRecently;

    /* the new entry sits between the header and the previous head */
    entry->lruMoreRecently = 0;
    entry->lruLessRecently = prevHead;
    VfdCache[prevHead].lruMoreRecently = file;
    VfdCache[0].lruLessRecently = file;

    DO_DB(_dump_lru());
}

/*
 * LruInsert --- make sure a VFD has a kernel descriptor (re-opening the file
 * and restoring its seek position if necessary) and put it at the head of
 * the LRU ring.
 *
 * returns 0 on success, -1 on re-open failure (with errno set)
 */
static int
LruInsert(File file)
{
    Vfd           *entry;

    Assert(file != 0);

    DO_DB(elog(LOG, "LruInsert %d (%s)",
               file, VfdCache[file].fileName));

    entry = &VfdCache[file];

    if (FileIsNotOpen(file))
    {
        /* Get under the FD limit before asking for another descriptor. */
        ReleaseLruFiles();

        /*
         * Even so, the open may run into descriptor exhaustion (for
         * instance a full system-wide file table); BasicOpenFile copes with
         * that by releasing further FDs as needed.
         */
        entry->fd = BasicOpenFile(entry->fileName, entry->fileFlags,
                                  entry->fileMode);
        if (entry->fd < 0)
        {
            DO_DB(elog(LOG, "re-open failed: %m"));
            return -1;
        }
        ++nfile;

        /*
         * Restore the saved position.  A saved FileUnknownPos needs no
         * special handling: lseek() will reject it, which makes the re-open
         * fail as announced in LruDelete().
         */
        if (entry->seekPos != (off_t) 0 &&
            lseek(entry->fd, entry->seekPos, SEEK_SET) < 0)
        {
            /* Failing to reposition is handled like an open() failure. */
            int            seekErrno = errno;

            elog(LOG, "could not seek file \"%s\" after re-opening: %m",
                 entry->fileName);
            (void) close(entry->fd);
            entry->fd = VFD_CLOSED;
            --nfile;
            errno = seekErrno;
            return -1;
        }
    }

    /* the file is open now; make it the most recently used entry */
    Insert(file);

    return 0;
}

/*
 * ReleaseLruFile --- free up one kernel FD by closing the VFD at the
 * least-recently-used end of the ring.
 *
 * Returns true if a file was closed, false if no VFD currently holds a
 * kernel FD.
 */
static bool
ReleaseLruFile(void)
{
    DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));

    if (nfile <= 0)
        return false;            /* nothing we could close */

    /* With open files around, the ring cannot be empty. */
    Assert(VfdCache[0].lruMoreRecently != 0);
    LruDelete(VfdCache[0].lruMoreRecently);
    return true;                /* one FD released */
}

/*
 * ReleaseLruFiles --- close least-recently-used VFDs until the number of
 * kernel FDs in use (VFDs plus allocated descriptors) is below max_safe_fds,
 * or until there is nothing left to close.  Afterwards it is OK to try to
 * open another file.
 */
static void
ReleaseLruFiles(void)
{
    for (;;)
    {
        if (nfile + numAllocatedDescs < max_safe_fds)
            return;                /* under the limit */
        if (!ReleaseLruFile())
            return;                /* no more victims */
    }
}

static File
AllocateVfd(void)
{
    Index        i;
    File        file;

    DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));

    Assert(SizeVfdCache > 0);    /* InitFileAccess not called? */

    if (VfdCache[0].nextFree == 0)
    {
        /*
         * The free list is empty so it is time to increase the size of the
         * array.  We choose to double it each time this happens. However,
         * there's not much point in starting *real* small.
         */
        Size        newCacheSize = SizeVfdCache * 2;
        Vfd           *newVfdCache;

        if (newCacheSize < 32)
            newCacheSize = 32;

        /*
         * Be careful not to clobber VfdCache ptr if realloc fails.
         */
        newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
        if (newVfdCache == NULL)
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("out of memory")));
        VfdCache = newVfdCache;

        /*
         * Initialize the new entries and link them into the free list.
         */
        for (i = SizeVfdCache; i < newCacheSize; i++)
        {
            MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
            VfdCache[i].nextFree = i + 1;
            VfdCache[i].fd = VFD_CLOSED;
        }
        VfdCache[newCacheSize - 1].nextFree = 0;
        VfdCache[0].nextFree = SizeVfdCache;

        /*
         * Record the new size
         */
        SizeVfdCache = newCacheSize;
    }

    file = VfdCache[0].nextFree;

    VfdCache[0].nextFree = VfdCache[file].nextFree;

    return file;
}

/*
 * FreeVfd --- release the file name copy held by a VFD (if any), clear its
 * state flags and push the slot back onto the free list.
 */
static void
FreeVfd(File file)
{
    Vfd           *entry = &VfdCache[file];

    DO_DB(elog(LOG, "FreeVfd: %d (%s)",
               file, entry->fileName ? entry->fileName : ""));

    /* the slot owns its malloc'd name copy */
    if (entry->fileName != NULL)
    {
        free(entry->fileName);
        entry->fileName = NULL;
    }
    entry->fdstate = 0x0;

    /* make this slot the new head of the free list */
    entry->nextFree = VfdCache[0].nextFree;
    VfdCache[0].nextFree = file;
}

/*
 * FileAccess --- prepare a VFD for I/O: ensure it has a kernel descriptor
 * and is the most recently used entry of the LRU ring.
 *
 * returns 0 on success, -1 on re-open failure (with errno set)
 */
static int
FileAccess(File file)
{
    DO_DB(elog(LOG, "FileAccess %d (%s)",
               file, VfdCache[file].fileName));

    /*
     * Not open: LruInsert re-opens it (possibly closing the least recently
     * used file to get an FD) and puts it at the head of the ring; its
     * result is ours.
     */
    if (FileIsNotOpen(file))
        return LruInsert(file);

    /*
     * Open, but not the entry accessed last: move it to the head of the
     * ring.
     */
    if (VfdCache[0].lruLessRecently != file)
    {
        Delete(file);
        Insert(file);
    }

    return 0;
}

/*
 *    Called when we get a shared invalidation message on some relation.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
    Assert(FileIsValid(file));

    /* a VFD without a kernel descriptor has nothing to give up */
    if (FileIsNotOpen(file))
        return;

    LruDelete(file);
}
#endif

/*
 * open a file in an arbitrary directory
 *
 * NB: if the passed pathname is relative (which it usually is),
 * it will be interpreted relative to the process' working directory
 * (which should always be $PGDATA when this code is running).
 *
 * Returns the VFD number on success.  If the file cannot be opened, returns
 * -1 with errno set by the failed open and no VFD slot left allocated.
 * Running out of memory is reported with ereport(ERROR).
 */
File
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
    char       *fnamecopy;
    File        file;
    Vfd           *vfdP;

    DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
               fileName, fileFlags, fileMode));

    /*
     * Grab the VFD slot before making the file name copy.  AllocateVfd() can
     * ereport(ERROR) when it fails to enlarge the cache; since the name copy
     * is plain malloc'd memory, it would be leaked if it already existed at
     * that point.
     */
    file = AllocateVfd();

    /*
     * We need a malloc'd copy of the file name; fail cleanly if no room,
     * handing the unused slot back first.
     */
    fnamecopy = strdup(fileName);
    if (fnamecopy == NULL)
    {
        FreeVfd(file);
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory")));
    }

    /* AllocateVfd() may have moved VfdCache, so take the pointer only now */
    vfdP = &VfdCache[file];

    /* Close excess kernel FDs. */
    ReleaseLruFiles();

    vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);

    if (vfdP->fd < 0)
    {
        /* preserve open()'s errno across the cleanup calls */
        int            save_errno = errno;

        FreeVfd(file);
        free(fnamecopy);
        errno = save_errno;
        return -1;
    }
    ++nfile;
    DO_DB(elog(LOG, "PathNameOpenFile: success %d",
               vfdP->fd));

    Insert(file);

    vfdP->fileName = fnamecopy;
    /* Saved flags are adjusted to be OK for re-opening file */
    vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
    vfdP->fileMode = fileMode;
    vfdP->seekPos = 0;
    vfdP->fileSize = 0;
    vfdP->fdstate = 0x0;
    vfdP->resowner = NULL;

    return file;
}

/*
 * Open a temporary file that will disappear when we close it.
 *
 * This routine takes care of generating an appropriate tempfile name.
 * There's no need to pass in fileFlags or fileMode either, since only
 * one setting makes any sense for a temp file.
 *
 * Unless interXact is true, the file is remembered by CurrentResourceOwner
 * to ensure it's closed and deleted when it's no longer needed, typically at
 * the end-of-transaction. In most cases, you don't want temporary files to
 * outlive the transaction that created them, so this should be false -- but
 * if you need "somewhat" temporary storage, this might be useful. In either
 * case, the file is removed when the File is explicitly closed.
 */
File
OpenTemporaryFile(bool interXact)
{
    File        tempFile = 0;

    /*
     * A transaction-scoped temp file goes into the next configured temp
     * tablespace, if there is one; a tablespace that cannot be used is
     * silently skipped in favour of the fallback below.
     *
     * A file meant to outlive the transaction never goes there: it is forced
     * into the database's default tablespace so that it cannot get in the
     * way of a tablespace drop.
     */
    if (!interXact && numTempTableSpaces > 0)
    {
        Oid            candidate = GetNextTempTableSpace();

        if (OidIsValid(candidate))
            tempFile = OpenTemporaryFileInTablespace(candidate, false);
    }

    /*
     * Fallback: the database's default tablespace, or pg_default should
     * MyDatabaseTableSpace not be set yet.  This attempt reports failure
     * with an error.
     */
    if (tempFile <= 0)
    {
        Oid            fallback = MyDatabaseTableSpace;

        if (!OidIsValid(fallback))
            fallback = DEFAULTTABLESPACE_OID;

        tempFile = OpenTemporaryFileInTablespace(fallback, true);
    }

    /* Have FileClose remove the file from disk. */
    VfdCache[tempFile].fdstate |= FD_TEMPORARY;

    /* A cross-transaction file is not tied to any resource owner. */
    if (interXact)
        return tempFile;

    /* Otherwise hand it to the current resource owner for cleanup. */
    VfdCache[tempFile].fdstate |= FD_XACT_TEMPORARY;

    ResourceOwnerEnlargeFiles(CurrentResourceOwner);
    ResourceOwnerRememberFile(CurrentResourceOwner, tempFile);
    VfdCache[tempFile].resowner = CurrentResourceOwner;

    /* ensure cleanup happens at eoxact */
    have_xact_temporary_files = true;

    return tempFile;
}

/*
 * Open a temporary file in a specific tablespace.
 * Subroutine for OpenTemporaryFile, which see for details.
 */
static File
OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
{
    const int    openFlags = O_RDWR | O_CREAT | O_TRUNC | PG_BINARY;
    char        dirPath[MAXPGPATH];
    char        filePath[MAXPGPATH];
    File        tempFile;
    bool        inDefaultSpace;

    /*
     * Work out the temp-file directory of the tablespace.  pg_global is
     * treated like pg_default.
     */
    inDefaultSpace = (tblspcOid == DEFAULTTABLESPACE_OID ||
                      tblspcOid == GLOBALTABLESPACE_OID);

    if (!inDefaultSpace)
    {
        /* All other tablespaces are accessed via symlinks */
#ifdef PGXC
        /* Postgres-XC tablespaces include node name in path */
        snprintf(dirPath, sizeof(dirPath), "pg_tblspc/%u/%s_%s/%s",
                 tblspcOid, TABLESPACE_VERSION_DIRECTORY, PGXCNodeName, PG_TEMP_FILES_DIR);
#else
        snprintf(dirPath, sizeof(dirPath), "pg_tblspc/%u/%s/%s",
                 tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
#endif
    }
    else
    {
        /* The default tablespace is {datadir}/base */
        snprintf(dirPath, sizeof(dirPath), "base/%s",
                 PG_TEMP_FILES_DIR);
    }

    /*
     * Build the file name from our PID and a per-process counter, which
     * should make it unique within the current database instance.
     */
    snprintf(filePath, sizeof(filePath), "%s/%s%d.%ld",
             dirPath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);

    /*
     * First attempt.  O_EXCL is deliberately not used, so an orphaned temp
     * file of the same name is simply reused.
     */
    tempFile = PathNameOpenFile(filePath, openFlags, 0600);
    if (tempFile > 0)
        return tempFile;

    /*
     * Perhaps the temp-file directory does not exist yet.  Try to create it
     * and ignore the result: somebody else may just have created it, and a
     * real problem will surface in the second open attempt anyway.
     */
    (void) mkdir(dirPath, S_IRWXU);

    tempFile = PathNameOpenFile(filePath, openFlags, 0600);
    if (tempFile <= 0 && rejectError)
        elog(ERROR, "could not create temporary file \"%s\": %m",
             filePath);

    return tempFile;
}

/*
 * close a file when done with it
 *
 * Closes the kernel descriptor if the VFD currently holds one, unlinks the
 * underlying file if the VFD is flagged FD_TEMPORARY (updating the temp-file
 * accounting and optionally logging its size), detaches the VFD from its
 * resource owner if it has one, and finally returns the VFD slot to the free
 * list.  Failures of close(), stat() and unlink() are only logged.
 */
void
FileClose(File file)
{// #lizard forgives
    Vfd           *vfdP;

    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FileClose: %d (%s)",
               file, VfdCache[file].fileName));

    vfdP = &VfdCache[file];

    /*
     * A VFD whose kernel FD was already given up by LruDelete() has been
     * taken out of the LRU ring there, so only the still-open case needs the
     * close() and the ring removal.
     */
    if (!FileIsNotOpen(file))
    {
        /* close the file */
        if (close(vfdP->fd))
            elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);

        --nfile;
        vfdP->fd = VFD_CLOSED;

        /* remove the file from the lru ring */
        Delete(file);
    }

    /*
     * Delete the file if it was temporary, and make a log entry if wanted
     */
    if (vfdP->fdstate & FD_TEMPORARY)
    {
        struct stat filestats;
        int            stat_errno;

        /*
         * If we get an error, as could happen within the ereport/elog calls,
         * we'll come right back here during transaction abort.  Reset the
         * flag to ensure that we can't get into an infinite loop.  This code
         * is arranged to ensure that the worst-case consequence is failing to
         * emit log message(s), not failing to attempt the unlink.
         */
        vfdP->fdstate &= ~FD_TEMPORARY;

        /* Subtract its size from current usage (do first in case of error) */
        temporary_files_size -= vfdP->fileSize;
        vfdP->fileSize = 0;

        /*
         * first try the stat(); its outcome is only remembered here and
         * reported after the unlink below
         */
        if (stat(vfdP->fileName, &filestats))
            stat_errno = errno;
        else
            stat_errno = 0;

        /* in any case do the unlink */
        if (unlink(vfdP->fileName))
            elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);

        /* and last report the stat results */
        if (stat_errno == 0)
        {
            pgstat_report_tempfile(filestats.st_size);

            /*
             * log_temp_files < 0 disables logging; otherwise the file is
             * logged when its size in kilobytes reaches that value
             */
            if (log_temp_files >= 0)
            {
                if ((filestats.st_size / 1024) >= log_temp_files)
                    ereport(LOG,
                            (errmsg("temporary file: path \"%s\", size %lu",
                                    vfdP->fileName,
                                    (unsigned long) filestats.st_size)));
            }
        }
        else
        {
            /* restore stat()'s errno so that %m reports the right failure */
            errno = stat_errno;
            elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
        }
    }

    /* Unregister it from the resource owner */
    if (vfdP->resowner)
        ResourceOwnerForgetFile(vfdP->resowner, file);

    /*
     * Return the Vfd slot to the free list
     */
    FreeVfd(file);
}

/*
 * FilePrefetch - initiate asynchronous read of a given range of the file.
 * The logical seek position is unaffected.
 *
 * Currently the only implementation of this function is using posix_fadvise
 * which is the simplest standardized interface that accomplishes this.
 * We could add an implementation using libaio in the future; but note that
 * this API is inappropriate for libaio, which wants to have a buffer provided
 * to read into.
 */
int
FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
{
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
    int            rc;

    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
               file, VfdCache[file].fileName,
               (int64) offset, amount));

    /* the advice needs a kernel descriptor; hand back a re-open failure */
    rc = FileAccess(file);
    if (rc >= 0)
    {
        pgstat_report_wait_start(wait_event_info);
        rc = posix_fadvise(VfdCache[file].fd, offset, amount,
                           POSIX_FADV_WILLNEED);
        pgstat_report_wait_end();
    }

    return rc;
#else
    /* no prefetch support compiled in: nothing to do */
    Assert(FileIsValid(file));
    return 0;
#endif
}

/*
 * FileWriteback --- pass the byte range [offset, offset + nbytes) of the
 * file to pg_flush_data().
 *
 * Does nothing if nbytes is not positive or if the file cannot be
 * (re-)opened; no status is returned either way.
 */
void
FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
{
    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
               file, VfdCache[file].fileName,
               (int64) offset, (int64) nbytes));

    /*
     * Caution: pg_flush_data must not be called with nbytes = 0, as that
     * could trash the file's seek position.  Such requests are defined to be
     * no-ops here.
     */
    if (nbytes <= 0)
        return;

    if (FileAccess(file) < 0)
        return;

    pgstat_report_wait_start(wait_event_info);
    pg_flush_data(VfdCache[file].fd, offset, nbytes);
    pgstat_report_wait_end();
}

/*
 * FileRead --- read up to 'amount' bytes from the file's current position
 * into 'buffer'.
 *
 * Returns the result of read(), or the negative result of FileAccess() if
 * the file could not be (re-)opened.  A known seek position is advanced by
 * the number of bytes read; after a failed read the position is marked as
 * unknown.  A read interrupted with EINTR is retried.
 */
int
FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
{
    int            nread;
    Vfd           *entry;

    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
               file, VfdCache[file].fileName,
               (int64) VfdCache[file].seekPos,
               amount, buffer));

    nread = FileAccess(file);
    if (nread < 0)
        return nread;

    entry = &VfdCache[file];

    for (;;)
    {
        pgstat_report_wait_start(wait_event_info);
        nread = read(entry->fd, buffer, amount);
        pgstat_report_wait_end();

        if (nread >= 0)
        {
            /* if seekPos is unknown, leave it that way */
            if (!FilePosIsUnknown(entry->seekPos))
                entry->seekPos += nread;
            break;
        }

        /*
         * Windows may run out of kernel buffers and return "Insufficient
         * system resources" error.  Wait a bit and retry to solve it.
         *
         * It is rumored that EINTR is also possible on some Unix filesystems,
         * in which case immediate retry is indicated.
         */
#ifdef WIN32
        {
            DWORD        error = GetLastError();

            switch (error)
            {
                case ERROR_NO_SYSTEM_RESOURCES:
                    pg_usleep(1000L);
                    errno = EINTR;
                    break;
                default:
                    _dosmaperr(error);
                    break;
            }
        }
#endif
        /* OK to retry if interrupted */
        if (errno == EINTR)
            continue;

        /* Trouble, so assume we don't know the file position anymore */
        entry->seekPos = FileUnknownPos;
        break;
    }

    return nread;
}

/*
 * FileWrite --- write 'amount' bytes from 'buffer' at the file's current
 * position.
 *
 * Returns the result of write(), or the negative result of FileAccess() if
 * the file could not be (re-)opened.  When write() transfers fewer bytes
 * than requested without setting errno, errno is set to ENOSPC.  A write
 * interrupted with EINTR is retried.
 *
 * For files flagged FD_TEMPORARY the per-file size and the
 * temporary_files_size total are kept up to date; if temp_file_limit is
 * being enforced (>= 0) and the write would push the total over it, an
 * ERROR is raised before anything is written.
 */
int
FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
{// #lizard forgives
    int            returnCode;
    Vfd           *vfdP;

    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
               file, VfdCache[file].fileName,
               (int64) VfdCache[file].seekPos,
               amount, buffer));

    returnCode = FileAccess(file);
    if (returnCode < 0)
        return returnCode;

    vfdP = &VfdCache[file];

    /*
     * If enforcing temp_file_limit and it's a temp file, check to see if the
     * write would overrun temp_file_limit, and throw error if so.  Note: it's
     * really a modularity violation to throw error here; we should set errno
     * and return -1.  However, there's no way to report a suitable error
     * message if we do that.  All current callers would just throw error
     * immediately anyway, so this is safe at present.
     */
    if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY))
    {
        off_t        newPos;

        /*
         * Normally we should know the seek position, but if for some reason
         * we have lost track of it, try again to get it.  Here, it's fine to
         * throw an error if we still can't get it.
         */
        if (FilePosIsUnknown(vfdP->seekPos))
        {
            vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
            if (FilePosIsUnknown(vfdP->seekPos))
                elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
        }

        /* only the part of the write that extends the file counts */
        newPos = vfdP->seekPos + amount;
        if (newPos > vfdP->fileSize)
        {
            uint64        newTotal = temporary_files_size;

            newTotal += newPos - vfdP->fileSize;
            /* temp_file_limit is compared in units of 1024 bytes */
            if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
                ereport(ERROR,
                        (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
                         errmsg("temporary file size exceeds temp_file_limit (%dkB)",
                                temp_file_limit)));
        }
    }

retry:
    /* clear errno so that a short write that leaves it unset is detectable */
    errno = 0;
    pgstat_report_wait_start(wait_event_info);
    returnCode = write(vfdP->fd, buffer, amount);
    pgstat_report_wait_end();

    /* if write didn't set errno, assume problem is no disk space */
    if (returnCode != amount && errno == 0)
        errno = ENOSPC;

    if (returnCode >= 0)
    {
        /* if seekPos is unknown, leave it that way */
        if (!FilePosIsUnknown(vfdP->seekPos))
            vfdP->seekPos += returnCode;

        /*
         * Maintain fileSize and temporary_files_size if it's a temp file.
         *
         * If seekPos is -1 (unknown), this will do nothing; but we could only
         * get here in that state if we're not enforcing temporary_files_size,
         * so we don't care.
         */
        if (vfdP->fdstate & FD_TEMPORARY)
        {
            off_t        newPos = vfdP->seekPos;

            if (newPos > vfdP->fileSize)
            {
                temporary_files_size += newPos - vfdP->fileSize;
                vfdP->fileSize = newPos;
            }
        }
    }
    else
    {
        /*
         * See comments in FileRead()
         */
#ifdef WIN32
        DWORD        error = GetLastError();

        switch (error)
        {
            case ERROR_NO_SYSTEM_RESOURCES:
                pg_usleep(1000L);
                errno = EINTR;
                break;
            default:
                _dosmaperr(error);
                break;
        }
#endif
        /* OK to retry if interrupted */
        if (errno == EINTR)
            goto retry;

        /* Trouble, so assume we don't know the file position anymore */
        vfdP->seekPos = FileUnknownPos;
    }

    return returnCode;
}

/*
 * FileSync --- run pg_fsync() on the file's kernel descriptor.
 *
 * Returns the result of pg_fsync(), or the negative result of FileAccess()
 * if the file could not be (re-)opened.
 */
int
FileSync(File file, uint32 wait_event_info)
{
    int            rc;

    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FileSync: %d (%s)",
               file, VfdCache[file].fileName));

    rc = FileAccess(file);
    if (rc >= 0)
    {
        pgstat_report_wait_start(wait_event_info);
        rc = pg_fsync(VfdCache[file].fd);
        pgstat_report_wait_end();
    }

    return rc;
}

/*
 * FileSeek --- change the file's seek position and return the new position.
 *
 * While the VFD has no kernel descriptor, SEEK_SET and SEEK_CUR only update
 * the remembered position; SEEK_END has to re-open the file.  While it is
 * open, lseek() is skipped when the request would not move a known position.
 *
 * Returns (off_t) -1 with errno = EINVAL for a negative target position
 * that is detected here or for SEEK_CUR on a closed file whose position is
 * unknown, and (off_t) -1 if the re-open for SEEK_END fails.  Otherwise the
 * return value is the tracked position, which is lseek()'s result whenever
 * lseek() was called.  An unsupported 'whence' raises ERROR.
 */
off_t
FileSeek(File file, off_t offset, int whence)
{// #lizard forgives
    Vfd           *entry;
    bool        closed;

    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
               file, VfdCache[file].fileName,
               (int64) VfdCache[file].seekPos,
               (int64) offset, whence));

    entry = &VfdCache[file];
    closed = FileIsNotOpen(file);

    switch (whence)
    {
        case SEEK_SET:
            if (offset < 0)
            {
                errno = EINVAL;
                return (off_t) -1;
            }
            if (closed)
                entry->seekPos = offset;    /* just remember it */
            else if (entry->seekPos != offset)
                entry->seekPos = lseek(entry->fd, offset, whence);
            break;

        case SEEK_CUR:
            if (closed)
            {
                /* can only be computed from a known position */
                if (FilePosIsUnknown(entry->seekPos) ||
                    entry->seekPos + offset < 0)
                {
                    errno = EINVAL;
                    return (off_t) -1;
                }
                entry->seekPos += offset;
            }
            else if (offset != 0 || FilePosIsUnknown(entry->seekPos))
                entry->seekPos = lseek(entry->fd, offset, whence);
            break;

        case SEEK_END:
            /* the end of file is known to the kernel only */
            if (closed && FileAccess(file) < 0)
                return (off_t) -1;
            entry->seekPos = lseek(entry->fd, offset, whence);
            break;

        default:
            elog(ERROR, "invalid whence: %d", whence);
            break;
    }

    return entry->seekPos;
}

/*
 * XXX not actually used but here for completeness
 */
#ifdef NOT_USED
off_t
FileTell(File file)
{
    Vfd           *entry;

    Assert(FileIsValid(file));
    DO_DB(elog(LOG, "FileTell %d (%s)",
               file, VfdCache[file].fileName));

    /* report the tracked position; the kernel is not consulted */
    entry = &VfdCache[file];
    return entry->seekPos;
}
#endif

/*
 * FileTruncate --- ftruncate() the file to 'offset' bytes.
 *
 * Returns the result of ftruncate(), or the negative result of FileAccess()
 * if the file could not be (re-)opened.  If the truncation succeeds and the
 * tracked fileSize was larger than 'offset', the temp-file accounting is
 * reduced accordingly.
 */
int
FileTruncate(File file, off_t offset, uint32 wait_event_info)
{
    int            rc;
    Vfd           *entry;

    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FileTruncate %d (%s)",
               file, VfdCache[file].fileName));

    rc = FileAccess(file);
    if (rc < 0)
        return rc;

    entry = &VfdCache[file];

    pgstat_report_wait_start(wait_event_info);
    rc = ftruncate(entry->fd, offset);
    pgstat_report_wait_end();

    /* nothing to account for on failure or if the tracked size is no larger */
    if (rc != 0 || entry->fileSize <= offset)
        return rc;

    /* adjust our state for truncation of a temp file */
    Assert(entry->fdstate & FD_TEMPORARY);
    temporary_files_size -= entry->fileSize - offset;
    entry->fileSize = offset;

    return rc;
}

#ifdef _SHARDING_
/*
 * FileDealloc --- deallocate the byte range starting at 'offset' and
 * spanning 'len' bytes by calling fallocate() with
 * FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE.
 *
 * Returns 0 on success, or the negative result of FileAccess() (with errno
 * set) if the file could not be (re-)opened.  A failing fallocate() is
 * reported with elog(ERROR), so that case does not return.
 *
 * When DISABLE_FALLOCATE is defined this is a no-op that returns 0.
 */
int
FileDealloc(File file, off_t offset, uint32 len,  uint32 wait_event_info)
{
#ifndef DISABLE_FALLOCATE
    int            returnCode;

    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FileDealloc %d (%s)",
               file, VfdCache[file].fileName));

    returnCode = FileAccess(file);
    if (returnCode < 0)
        return returnCode;

    pgstat_report_wait_start(wait_event_info);
    returnCode = fallocate(VfdCache[file].fd,
                           FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                           offset,
                           len);
    pgstat_report_wait_end();

    if (returnCode != 0)
    {
        /*
         * Print the offset at full width: off_t may be 64 bits wide, and
         * narrowing it to 32 bits would misreport offsets of 4GB and above.
         * %m gives the errno text like the other messages in this file.
         */
        elog(ERROR, "dealloc file %s of length %x from %llx failed. errmsg:%m.",
             VfdCache[file].fileName,
             len,
             (unsigned long long) offset);
    }

    return returnCode;
#else
    /* hole punching compiled out: only validate the argument */
    Assert(FileIsValid(file));
    return 0;
#endif
}
#endif

/*
 * Return the pathname associated with an open file.
 *
 * The returned string points to an internal buffer, which is valid until
 * the file is closed.
 */
char *
FilePathName(File file)
{
    Vfd           *entry;

    Assert(FileIsValid(file));

    /* hand out the VFD's own name buffer, not a copy */
    entry = &VfdCache[file];
    return entry->fileName;
}

/*
 * Return the raw file descriptor of an opened file.
 *
 * The returned file descriptor will be valid until the file is closed, but
 * there are a lot of things that can make that happen.  So the caller should
 * be careful not to do much of anything else before it finishes using the
 * returned file descriptor.
 */
int
FileGetRawDesc(File file)
{
    Vfd           *entry;

    Assert(FileIsValid(file));

    /* whatever the VFD currently holds; the file is not re-opened here */
    entry = &VfdCache[file];
    return entry->fd;
}

/*
 * FileGetRawFlags - returns the file flags on open(2)
 */
int
FileGetRawFlags(File file)
{
    Vfd           *entry;

    Assert(FileIsValid(file));

    /* the flags as saved in the VFD */
    entry = &VfdCache[file];
    return entry->fileFlags;
}

/*
 * FileGetRawMode - returns the mode bitmask passed to open(2)
 */
int
FileGetRawMode(File file)
{
    Vfd           *entry;

    Assert(FileIsValid(file));

    /* the mode as saved in the VFD */
    entry = &VfdCache[file];
    return entry->fileMode;
}

/*
 * Make room for another allocatedDescs[] array entry if needed and possible.
 * Returns true if an array element is available.
 */
static bool
reserveAllocatedDesc(void)
{
    AllocateDesc *grown;
    int         wantedMax;

    /* Nothing to do when the array still has an unused element. */
    if (numAllocatedDescs < maxAllocatedDescs)
        return true;

    if (allocatedDescs != NULL)
    {
        /*
         * The array exists but is full, so consider enlarging it.  By now
         * max_safe_fds should be known accurately.
         *
         * Allocated descriptors must not hog all the available FDs; a
         * reasonable number has to remain for VFD use.  Hence the ceiling is
         * max_safe_fds / 2.  (That should certainly be at least as large as
         * the initial size, FD_MINFREE / 2.)
         */
        wantedMax = max_safe_fds / 2;
        if (wantedMax <= maxAllocatedDescs)
            return false;       /* already at the ceiling */

        grown = (AllocateDesc *) realloc(allocatedDescs,
                                         wantedMax * sizeof(AllocateDesc));

        /* Running out of memory here is reported to the caller, not thrown. */
        if (grown == NULL)
            return false;

        allocatedDescs = grown;
        maxAllocatedDescs = wantedMax;
        return true;
    }

    /*
     * First use in this process: start with FD_MINFREE / 2 elements.  In
     * many scenarios that is all we will ever need.  max_safe_fds is not
     * consulted yet because set_max_safe_fds() may not have run.
     */
    wantedMax = FD_MINFREE / 2;
    grown = (AllocateDesc *) malloc(wantedMax * sizeof(AllocateDesc));

    /* Out of memory this early?  Raise an ERROR rather than return false. */
    if (grown == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory")));

    allocatedDescs = grown;
    maxAllocatedDescs = wantedMax;
    return true;
}

/*
 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
 * rather than plain fopen().  This lets fd.c deal with freeing FDs if
 * necessary to open the file.  When done, call FreeFile rather than fclose.
 *
 * Note that files that will be open for any significant length of time
 * should NOT be handled this way, since they cannot share kernel file
 * descriptors with other files; there is grave risk of running out of FDs
 * if anyone locks down too many FDs.  Most callers of this routine are
 * simply reading a config file that they will read and close immediately.
 *
 * fd.c will automatically close all files opened with AllocateFile at
 * transaction commit or abort; this prevents FD leakage if a routine
 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
 *
 * Ideally this should be the *only* direct call of fopen() in the backend.
 */
FILE *
AllocateFile(const char *name, const char *mode)
{
    DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
               numAllocatedDescs, name));

    /* Give up with an ERROR if no allocatedDescs[] element can be had. */
    if (!reserveAllocatedDesc())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
                        maxAllocatedDescs, name)));

    /* Close excess kernel FDs. */
    ReleaseLruFiles();

    /* Keep trying for as long as releasing an LRU file frees up an FD. */
    for (;;)
    {
        FILE       *stream = fopen(name, mode);
        int         save_errno;

        if (stream != NULL)
        {
            /* Success: record the stream in the next free array element. */
            AllocateDesc *slot = &allocatedDescs[numAllocatedDescs];

            slot->kind = AllocateDescFile;
            slot->desc.file = stream;
            slot->create_subid = GetCurrentSubTransactionId();
            numAllocatedDescs++;
            return stream;
        }

        /* Only descriptor exhaustion is worth a retry; errno stays as set. */
        if (errno != EMFILE && errno != ENFILE)
            return NULL;

        save_errno = errno;

        ereport(LOG,
                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                 errmsg("out of file descriptors: %m; release and retry")));
        errno = 0;
        if (!ReleaseLruFile())
        {
            /* Nothing could be released: report the original failure. */
            errno = save_errno;
            return NULL;
        }
    }
}


/*
 * Like AllocateFile, but returns an unbuffered fd like open(2)
 */
int
OpenTransientFile(FileName fileName, int fileFlags, int fileMode)
{
    AllocateDesc *slot;
    int         rawfd;

    DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
               numAllocatedDescs, fileName));

    /* Give up with an ERROR if no allocatedDescs[] element can be had. */
    if (!reserveAllocatedDesc())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
                        maxAllocatedDescs, fileName)));

    /* Close excess kernel FDs. */
    ReleaseLruFiles();

    rawfd = BasicOpenFile(fileName, fileFlags, fileMode);
    if (rawfd < 0)
        return -1;                /* failure */

    /* Success: record the descriptor in the next free array element. */
    slot = &allocatedDescs[numAllocatedDescs];
    slot->kind = AllocateDescRawFD;
    slot->desc.fd = rawfd;
    slot->create_subid = GetCurrentSubTransactionId();
    numAllocatedDescs++;

    return rawfd;
}

/*
 * Routines that want to initiate a pipe stream should use OpenPipeStream
 * rather than plain popen().  This lets fd.c deal with freeing FDs if
 * necessary.  When done, call ClosePipeStream rather than pclose.
 */
FILE *
OpenPipeStream(const char *command, const char *mode)
{
    DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
               numAllocatedDescs, command));

    /* Give up with an ERROR if no allocatedDescs[] element can be had. */
    if (!reserveAllocatedDesc())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
                        maxAllocatedDescs, command)));

    /* Close excess kernel FDs. */
    ReleaseLruFiles();

    /* Keep trying for as long as releasing an LRU file frees up an FD. */
    for (;;)
    {
        FILE       *stream;
        int         save_errno;

        /* Flush our own stdio output and clear errno before each attempt. */
        fflush(stdout);
        fflush(stderr);
        errno = 0;

        stream = popen(command, mode);
        if (stream != NULL)
        {
            /* Success: record the stream in the next free array element. */
            AllocateDesc *slot = &allocatedDescs[numAllocatedDescs];

            slot->kind = AllocateDescPipe;
            slot->desc.file = stream;
            slot->create_subid = GetCurrentSubTransactionId();
            numAllocatedDescs++;
            return stream;
        }

        /* Only descriptor exhaustion is worth a retry; errno stays as set. */
        if (errno != EMFILE && errno != ENFILE)
            return NULL;

        save_errno = errno;

        ereport(LOG,
                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                 errmsg("out of file descriptors: %m; release and retry")));
        errno = 0;
        if (!ReleaseLruFile())
        {
            /* Nothing could be released: report the original failure. */
            errno = save_errno;
            return NULL;
        }
    }
}

/*
 * Free an AllocateDesc of any type.
 *
 * The argument *must* point into the allocatedDescs[] array.
 */
static int
FreeDesc(AllocateDesc *desc)
{
    int         rc = 0;            /* stays 0 if the kind is unrecognized */

    /* Close the underlying object with the routine matching its kind. */
    if (desc->kind == AllocateDescFile)
        rc = fclose(desc->desc.file);
    else if (desc->kind == AllocateDescPipe)
        rc = pclose(desc->desc.file);
    else if (desc->kind == AllocateDescDir)
        rc = closedir(desc->desc.dir);
    else if (desc->kind == AllocateDescRawFD)
        rc = close(desc->desc.fd);
    else
        elog(ERROR, "AllocateDesc kind not recognized");

    /*
     * Keep the array dense: shrink the count and copy the former last
     * element over the one just released.
     */
    *desc = allocatedDescs[--numAllocatedDescs];

    return rc;
}

/*
 * Close a file returned by AllocateFile.
 *
 * Note we do not check fclose's return value --- it is up to the caller
 * to handle close errors.
 */
int
FreeFile(FILE *file)
{
    int         idx;

    DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));

    /* Scan from the last array element back to the first for this stream. */
    for (idx = numAllocatedDescs - 1; idx >= 0; idx--)
    {
        AllocateDesc *cand = &allocatedDescs[idx];

        if (cand->kind != AllocateDescFile)
            continue;
        if (cand->desc.file != file)
            continue;

        /* Found it: FreeDesc closes the stream and drops the entry. */
        return FreeDesc(cand);
    }

    /* Only get here if someone passes us a file not in allocatedDescs */
    elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");

    return fclose(file);
}

/*
 * Close a file returned by OpenTransientFile.
 *
 * Note we do not check close's return value --- it is up to the caller
 * to handle close errors.
 */
int
CloseTransientFile(int fd)
{
    int         idx;

    DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));

    /* Scan from the last array element back to the first for this fd. */
    for (idx = numAllocatedDescs - 1; idx >= 0; idx--)
    {
        AllocateDesc *cand = &allocatedDescs[idx];

        if (cand->kind != AllocateDescRawFD)
            continue;
        if (cand->desc.fd != fd)
            continue;

        /* Found it: FreeDesc closes the fd and drops the entry. */
        return FreeDesc(cand);
    }

    /* Only get here if someone passes us a file not in allocatedDescs */
    elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");

    return close(fd);
}

/*
 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
 * rather than plain opendir().  This lets fd.c deal with freeing FDs if
 * necessary to open the directory, and with closing it after an elog.
 * When done, call FreeDir rather than closedir.
 *
 * Ideally this should be the *only* direct call of opendir() in the backend.
 */
DIR *
AllocateDir(const char *dirname)
{
    DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
               numAllocatedDescs, dirname));

    /* Give up with an ERROR if no allocatedDescs[] element can be had. */
    if (!reserveAllocatedDesc())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
                        maxAllocatedDescs, dirname)));

    /* Close excess kernel FDs. */
    ReleaseLruFiles();

    /* Keep trying for as long as releasing an LRU file frees up an FD. */
    for (;;)
    {
        DIR           *handle = opendir(dirname);
        int         save_errno;

        if (handle != NULL)
        {
            /* Success: record the handle in the next free array element. */
            AllocateDesc *slot = &allocatedDescs[numAllocatedDescs];

            slot->kind = AllocateDescDir;
            slot->desc.dir = handle;
            slot->create_subid = GetCurrentSubTransactionId();
            numAllocatedDescs++;
            return handle;
        }

        /* Only descriptor exhaustion is worth a retry; errno stays as set. */
        if (errno != EMFILE && errno != ENFILE)
            return NULL;

        save_errno = errno;

        ereport(LOG,
                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                 errmsg("out of file descriptors: %m; release and retry")));
        errno = 0;
        if (!ReleaseLruFile())
        {
            /* Nothing could be released: report the original failure. */
            errno = save_errno;
            return NULL;
        }
    }
}

/*
 * Read a directory opened with AllocateDir, ereport'ing any error.
 *
 * This is easier to use than raw readdir() since it takes care of some
 * otherwise rather tedious and error-prone manipulation of errno.  Also,
 * if you are happy with a generic error message for AllocateDir failure,
 * you can just do
 *
 *        dir = AllocateDir(path);
 *        while ((dirent = ReadDir(dir, path)) != NULL)
 *            process dirent;
 *        FreeDir(dir);
 *
 * since a NULL dir parameter is taken as indicating AllocateDir failed.
 * (Make sure errno hasn't been changed since AllocateDir if you use this
 * shortcut.)
 *
 * The pathname passed to AllocateDir must be passed to this routine too,
 * but it is only used for error reporting.
 */
struct dirent *
ReadDir(DIR *dir, const char *dirname)
{
    return ReadDirExtended(dir, dirname, ERROR);
}

/*
 * Alternate version that allows caller to specify the elevel for any
 * error report.  If elevel < ERROR, returns NULL on any error.
 */
static struct dirent *
ReadDirExtended(DIR *dir, const char *dirname, int elevel)
{
    struct dirent *entry;

    /*
     * A NULL handle is taken to mean the caller's AllocateDir failed and was
     * not reported; issue the generic message (using the errno left behind).
     */
    if (dir == NULL)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not open directory \"%s\": %m",
                        dirname)));
        return NULL;
    }

    /*
     * readdir() returns NULL both at end of directory and on failure; clear
     * errno first so the two can be told apart afterwards.
     */
    errno = 0;
    entry = readdir(dir);

    if (entry == NULL && errno != 0)
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not read directory \"%s\": %m",
                        dirname)));

    /* Either a valid entry, or NULL for end-of-directory / reported error. */
    return entry;
}

/*
 * Close a directory opened with AllocateDir.
 *
 * Note we do not check closedir's return value --- it is up to the caller
 * to handle close errors.
 */
int
FreeDir(DIR *dir)
{
    int         idx;

    DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));

    /* Scan from the last array element back to the first for this handle. */
    for (idx = numAllocatedDescs - 1; idx >= 0; idx--)
    {
        AllocateDesc *cand = &allocatedDescs[idx];

        if (cand->kind != AllocateDescDir)
            continue;
        if (cand->desc.dir != dir)
            continue;

        /* Found it: FreeDesc closes the directory and drops the entry. */
        return FreeDesc(cand);
    }

    /* Only get here if someone passes us a dir not in allocatedDescs */
    elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");

    return closedir(dir);
}


/*
 * Close a pipe stream returned by OpenPipeStream.
 */
int
ClosePipeStream(FILE *file)
{
    int         idx;

    DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));

    /* Scan from the last array element back to the first for this stream. */
    for (idx = numAllocatedDescs - 1; idx >= 0; idx--)
    {
        AllocateDesc *cand = &allocatedDescs[idx];

        if (cand->kind != AllocateDescPipe)
            continue;
        if (cand->desc.file != file)
            continue;

        /* Found it: FreeDesc pclose()s the stream and drops the entry. */
        return FreeDesc(cand);
    }

    /* Only get here if someone passes us a file not in allocatedDescs */
    elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");

    return pclose(file);
}

/*
 * closeAllVfds
 *
 * Force all VFDs into the physically-closed state, so that the fewest
 * possible number of kernel file descriptors are in use.  There is no
 * change in the logical state of the VFDs.
 */
void
closeAllVfds(void)
{
    Index        slot;

    /* With an empty VFD cache there is nothing to close. */
    if (SizeVfdCache <= 0)
        return;

    Assert(FileIsNotOpen(0));    /* Make sure ring not corrupted */

    /* Entry 0 is skipped; every other entry that is open gets LruDelete'd. */
    for (slot = 1; slot < SizeVfdCache; slot++)
    {
        if (FileIsNotOpen(slot))
            continue;
        LruDelete(slot);
    }
}


/*
 * SetTempTablespaces
 *
 * Define a list (actually an array) of OIDs of tablespaces to use for
 * temporary files.  This list will be used until end of transaction,
 * unless this function is called again before then.  It is caller's
 * responsibility that the passed-in array has adequate lifespan (typically
 * it'd be allocated in TopTransactionContext).
 */
void
SetTempTablespaces(Oid *tableSpaces, int numSpaces)
{
    Assert(numSpaces >= 0);

    /* Only the pointer is stored; the caller's array is not copied. */
    tempTableSpaces = tableSpaces;
    numTempTableSpaces = numSpaces;

    /*
     * Start at a random position in the list.  Backends are most likely
     * sharing the same list of temp tablespaces, so this minimizes conflicts
     * between them.  Within one transaction the position then advances
     * circularly with each temp file, which spreads large temporary sort
     * files nicely across all available tablespaces.  With fewer than two
     * entries there is nothing to randomize.
     */
    nextTempTableSpace = (numSpaces > 1) ? random() % numSpaces : 0;
}

/*
 * TempTablespacesAreSet
 *
 * Returns TRUE if SetTempTablespaces has been called in current transaction.
 * (This is just so that tablespaces.c doesn't need its own per-transaction
 * state.)
 */
bool
TempTablespacesAreSet(void)
{
    return (numTempTableSpaces >= 0);
}

/*
 * GetNextTempTableSpace
 *
 * Select the next temp tablespace to use.  A result of InvalidOid means
 * to use the current database's default tablespace.
 */
Oid
GetNextTempTableSpace(void)
{
    /* No usable list: tell the caller to fall back to the default. */
    if (numTempTableSpaces <= 0)
        return InvalidOid;

    /* Step to the next list position, wrapping to the start at the end. */
    nextTempTableSpace++;
    if (nextTempTableSpace >= numTempTableSpaces)
        nextTempTableSpace = 0;

    return tempTableSpaces[nextTempTableSpace];
}


/*
 * AtEOSubXact_Files
 *
 * Take care of subtransaction commit/abort.  At abort, we close temp files
 * that the subtransaction may have opened.  At commit, we reassign the
 * files that were opened to the parent subtransaction.
 */
void
AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
                  SubTransactionId parentSubid)
{
    Index        pos = 0;

    while (pos < numAllocatedDescs)
    {
        AllocateDesc *entry = &allocatedDescs[pos];

        /* Entries created by other subtransactions are left alone. */
        if (entry->create_subid != mySubid)
        {
            pos++;
            continue;
        }

        if (isCommit)
        {
            /* On commit the entry is handed over to the parent. */
            entry->create_subid = parentSubid;
            pos++;
        }
        else
        {
            /*
             * On abort the entry is released.  FreeDesc moves the last array
             * element into this position, so pos is deliberately not
             * advanced: the same index has to be examined again.
             */
            FreeDesc(entry);
        }
    }
}

/*
 * AtEOXact_Files
 *
 * This routine is called during transaction commit or abort (it doesn't
 * particularly care which).  All still-open per-transaction temporary file
 * VFDs are closed, which also causes the underlying files to be deleted
 * (although they should've been closed already by the ResourceOwner
 * cleanup). Furthermore, all "allocated" stdio files are closed. We also
 * forget any transaction-local temp tablespace list.
 */
void
AtEOXact_Files(void)
{
    /* Transaction-end cleanup only, i.e. isProcExit = false. */
    CleanupTempFiles(false);

    /*
     * Forget the temp tablespace list.  The array itself is not freed here;
     * only the pointer is dropped.  A count of -1 is what makes
     * TempTablespacesAreSet() report false again.
     */
    tempTableSpaces = NULL;
    numTempTableSpaces = -1;
}

/*
 * AtProcExit_Files
 *
 * on_proc_exit hook to clean up temp files during backend shutdown.
 * Here, we want to clean up *all* temp files including interXact ones.
 */
static void
AtProcExit_Files(int code, Datum arg)
{
    /* Neither 'code' nor 'arg' is used; always do the full proc-exit cleanup. */
    CleanupTempFiles(true);
}

/*
 * Close temporary files and delete their underlying files.
 *
 * isProcExit: if true, this is being called as the backend process is
 * exiting. If that's the case, we should remove all temporary files; if
 * that's not the case, we are being called for transaction commit/abort
 * and should only remove transaction-local temp files.  In either case,
 * also clean up "allocated" stdio files, dirs and fds.
 */
static void
CleanupTempFiles(bool isProcExit)
{// #lizard forgives
    /*
     * The VFD scan is needed at proc_exit regardless, and otherwise only if
     * some transaction-scoped temp file was flagged.
     */
    if (isProcExit || have_xact_temporary_files)
    {
        Index        slot;

        Assert(FileIsNotOpen(0));    /* Make sure ring not corrupted */

        for (slot = 1; slot < SizeVfdCache; slot++)
        {
            unsigned short state = VfdCache[slot].fdstate;

            /* Skip entries that are not temporary or have no file name. */
            if (!(state & FD_TEMPORARY) || VfdCache[slot].fileName == NULL)
                continue;

            /* At backend exit every temporary file is closed. */
            if (isProcExit)
            {
                FileClose(slot);
                continue;
            }

            /*
             * At transaction end only transaction-local temp files are
             * closed.  The ResourceOwner mechanism should have closed them
             * already, so finding one here is just a debugging cross-check
             * and earns a WARNING.
             */
            if (state & FD_XACT_TEMPORARY)
            {
                elog(WARNING,
                     "temporary file %s not closed at end-of-transaction",
                     VfdCache[slot].fileName);
                FileClose(slot);
            }
        }

        have_xact_temporary_files = false;
    }

    /*
     * Release every "allocated" stdio file, dir and fd.  FreeDesc compacts
     * the array, so repeatedly freeing element 0 drains it.
     */
    while (numAllocatedDescs > 0)
        FreeDesc(&allocatedDescs[0]);
}


/*
 * Remove temporary and temporary relation files left over from a prior
 * postmaster session
 *
 * This should be called during postmaster startup.  It will forcibly
 * remove any leftover files created by OpenTemporaryFile and any leftover
 * temporary relation files created by mdcreate.
 *
 * NOTE: we could, but don't, call this during a post-backend-crash restart
 * cycle.  The argument for not doing it is that someone might want to examine
 * the temp files for debugging purposes.  This does however mean that
 * OpenTemporaryFile had better allow for collision with an existing temp
 * file name.
 */
void
RemovePgTempFiles(void)
{
    char        path_buf[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
    DIR           *tblspc_dir;
    struct dirent *entry;

    /*
     * Start with pg_default ($PGDATA/base): its temp-files directory, then
     * temporary relation files in its per-database directories.
     */
    snprintf(path_buf, sizeof(path_buf), "base/%s", PG_TEMP_FILES_DIR);
    RemovePgTempFilesInDir(path_buf);
    RemovePgTempRelationFiles("base");

    /*
     * Then visit every entry under pg_tblspc.  A failed AllocateDir is
     * reported by ReadDir, which treats a NULL handle as an open failure.
     */
    tblspc_dir = AllocateDir("pg_tblspc");

    for (;;)
    {
        const char *spcname;

        entry = ReadDir(tblspc_dir, "pg_tblspc");
        if (entry == NULL)
            break;

        spcname = entry->d_name;
        if (strcmp(spcname, ".") == 0 ||
            strcmp(spcname, "..") == 0)
            continue;

        /* First the tablespace's temp-files directory ... */
#ifdef PGXC
        /* Postgres-XC tablespaces include node name in path */
        snprintf(path_buf, sizeof(path_buf), "pg_tblspc/%s/%s_%s/%s",
                 spcname, TABLESPACE_VERSION_DIRECTORY, PGXCNodeName, PG_TEMP_FILES_DIR);
#else
        snprintf(path_buf, sizeof(path_buf), "pg_tblspc/%s/%s/%s",
                 spcname, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
#endif
        RemovePgTempFilesInDir(path_buf);

        /* ... then temporary relation files below its version directory. */
#ifdef PGXC
        /* Postgres-XC tablespaces include node name in path */
        snprintf(path_buf, sizeof(path_buf), "pg_tblspc/%s/%s_%s",
                 spcname, TABLESPACE_VERSION_DIRECTORY, PGXCNodeName);
#else
        snprintf(path_buf, sizeof(path_buf), "pg_tblspc/%s/%s",
                 spcname, TABLESPACE_VERSION_DIRECTORY);
#endif
        RemovePgTempRelationFiles(path_buf);
    }

    FreeDir(tblspc_dir);

    /*
     * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
     * DataDir as well.
     */
#ifdef EXEC_BACKEND
    RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
#endif
}

/* Process one pgsql_tmp directory for RemovePgTempFiles */
static void
RemovePgTempFilesInDir(const char *tmpdirname)
{
    char        victim[MAXPGPATH * 2];
    DIR           *dirp;
    struct dirent *entry;

    dirp = AllocateDir(tmpdirname);
    if (dirp == NULL)
    {
        /* A missing directory is normal; any other failure gets logged. */
        if (errno != ENOENT)
            elog(LOG,
                 "could not open temporary-files directory \"%s\": %m",
                 tmpdirname);
        return;
    }

    while ((entry = ReadDir(dirp, tmpdirname)) != NULL)
    {
        const char *fname = entry->d_name;

        if (strcmp(fname, ".") == 0 ||
            strcmp(fname, "..") == 0)
            continue;

        /* The full path is needed both for unlinking and for the log line. */
        snprintf(victim, sizeof(victim), "%s/%s",
                 tmpdirname, fname);

        /* Names lacking the temp-file prefix are reported, never removed. */
        if (strncmp(fname,
                    PG_TEMP_FILE_PREFIX,
                    strlen(PG_TEMP_FILE_PREFIX)) != 0)
        {
            elog(LOG,
                 "unexpected file found in temporary-files directory: \"%s\"",
                 victim);
            continue;
        }

        unlink(victim);            /* note we ignore any error */
    }

    FreeDir(dirp);
}

/* Process one tablespace directory, look for per-DB subdirectories */
static void
RemovePgTempRelationFiles(const char *tsdirname)
{
    char        dbdir_path[MAXPGPATH * 2];
    DIR           *dirp;
    struct dirent *entry;

    dirp = AllocateDir(tsdirname);
    if (dirp == NULL)
    {
        /* A missing directory is normal; any other failure gets logged. */
        if (errno != ENOENT)
            elog(LOG,
                 "could not open tablespace directory \"%s\": %m",
                 tsdirname);
        return;
    }

    while ((entry = ReadDir(dirp, tsdirname)) != NULL)
    {
        const char *scan = entry->d_name;

        /*
         * Only per-database directories matter, and their names consist
         * solely of digits.  Skip the leading digits; the name qualifies
         * only if there was at least one digit and nothing follows them.
         * This also (properly) rejects "." and "..".
         */
        while (isdigit((unsigned char) *scan))
            scan++;
        if (scan == entry->d_name || *scan != '\0')
            continue;

        snprintf(dbdir_path, sizeof(dbdir_path), "%s/%s",
                 tsdirname, entry->d_name);
        RemovePgTempRelationFilesInDbspace(dbdir_path);
    }

    FreeDir(dirp);
}

/* Process one per-dbspace directory for RemovePgTempRelationFiles */
static void
RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
{
    char        victim[MAXPGPATH * 2];
    DIR           *dirp;
    struct dirent *entry;

    dirp = AllocateDir(dbspacedirname);
    if (dirp == NULL)
    {
        /* we just saw this directory, so it really ought to be there */
        elog(LOG,
             "could not open dbspace directory \"%s\": %m",
             dbspacedirname);
        return;
    }

    while ((entry = ReadDir(dirp, dbspacedirname)) != NULL)
    {
        /* Only names with the temp-relation pattern are removed. */
        if (looks_like_temp_rel_name(entry->d_name))
        {
            snprintf(victim, sizeof(victim), "%s/%s",
                     dbspacedirname, entry->d_name);

            unlink(victim);        /* note we ignore any error */
        }
    }

    FreeDir(dirp);
}

/* t<digits>_<digits>, or t<digits>_<digits>_<forkname> */
static bool
looks_like_temp_rel_name(const char *name)
{// #lizard forgives
    const char *cur = name;        /* next character still to be matched */
    const char *mark;            /* start of the digit run being scanned */

    /* Must start with "t". */
    if (*cur != 't')
        return false;
    cur++;

    /* Then one or more digits, terminated by an underscore. */
    mark = cur;
    while (isdigit((unsigned char) *cur))
        cur++;
    if (cur == mark || *cur != '_')
        return false;
    cur++;

    /* Then a second non-empty run of digits. */
    mark = cur;
    while (isdigit((unsigned char) *cur))
        cur++;
    if (cur == mark)
        return false;

    /* Optional "_forkname": forkname_chars says how many characters match. */
    if (*cur == '_')
    {
        int            forklen = forkname_chars(cur + 1, NULL);

        if (forklen <= 0)
            return false;
        cur += forklen + 1;        /* step over '_' plus the fork name */
    }

    /* Optional ".segment": a dot followed by at least one digit. */
    if (*cur == '.')
    {
        const char *seg = cur + 1;

        while (isdigit((unsigned char) *seg))
            seg++;
        if (seg == cur + 1)
            return false;
        cur = seg;
    }

    /* Anything left over disqualifies the name. */
    if (*cur != '\0')
        return false;
    return true;
}


/*
 * Issue fsync recursively on PGDATA and all its contents.
 *
 * We fsync regular files and directories wherever they are, but we
 * follow symlinks only for pg_wal and immediately under pg_tblspc.
 * Other symlinks are presumed to point at files we're not responsible
 * for fsyncing, and might not have privileges to write at all.
 *
 * Errors are logged but not considered fatal; that's because this is used
 * only during database startup, to deal with the possibility that there are
 * issued-but-unsynced writes pending against the data directory.  We want to
 * ensure that such writes reach disk before anything that's done in the new
 * run.  However, aborting on error would result in failure to start for
 * harmless cases such as read-only files in the data directory, and that's
 * not good either.
 *
 * Note we assume we're chdir'd into PGDATA to begin with.
 */
void
SyncDataDirectory(void)
{
    /*
     * Set when pg_wal is a symlink (a junction on Windows).  It then needs a
     * walkdir pass of its own, because the pass over "." ignores symlinks.
     */
    bool        wal_is_link = false;

    /* We can skip this whole thing if fsync is disabled. */
    if (!enableFsync)
        return;

#ifndef WIN32
    {
        struct stat wal_st;

        /* A failed lstat is only logged; pg_wal is then treated as plain. */
        if (lstat("pg_wal", &wal_st) < 0)
            ereport(LOG,
                    (errcode_for_file_access(),
                     errmsg("could not stat file \"%s\": %m",
                            "pg_wal")));
        else if (S_ISLNK(wal_st.st_mode))
            wal_is_link = true;
    }
#else
    if (pgwin32_is_junction("pg_wal"))
        wal_is_link = true;
#endif

    /*
     * Where supported, first hint to the kernel that the data directory and
     * its contents are about to be fsync'd.  Failures in this pass are even
     * less interesting than usual, hence only DEBUG1.
     */
#ifdef PG_FLUSH_DATA_WORKS
    walkdir(".", pre_sync_fname, false, DEBUG1);
    if (wal_is_link)
        walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
    walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
#endif

    /*
     * Now the real fsync pass, in the same order.
     *
     * The walk over "." ignores symlinks.  So besides the separate pass for
     * a symlinked pg_wal, pg_tblspc is walked again with process_symlinks =
     * true.  Any plain directories in pg_tblspc thereby get fsync'd twice;
     * that's not an expected case so we don't worry about optimizing it.
     */
    walkdir(".", datadir_fsync_fname, false, LOG);
    if (wal_is_link)
        walkdir("pg_wal", datadir_fsync_fname, false, LOG);
    walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
}

/*
 * walkdir: recursively walk a directory, applying the action to each
 * regular file and directory (including the named directory itself).
 *
 * If process_symlinks is true, the action and recursion are also applied
 * to regular files and directories that are pointed to by symlinks in the
 * given directory; otherwise symlinks are ignored.  Symlinks are always
 * ignored in subdirectories, ie we intentionally don't pass down the
 * process_symlinks flag to recursive calls.
 *
 * Errors are reported at level elevel, which might be ERROR or less.
 *
 * See also walkdir in initdb.c, which is a frontend version of this logic.
 */
static void
walkdir(const char *path,
        void (*action) (const char *fname, bool isdir, int elevel),
        bool process_symlinks,
        int elevel)
{// #lizard forgives
    DIR           *dirp;
    struct dirent *entry;

    dirp = AllocateDir(path);
    if (dirp == NULL)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not open directory \"%s\": %m", path)));
        return;
    }

    for (;;)
    {
        char        child[MAXPGPATH * 2];
        struct stat child_st;
        int            rc;

        entry = ReadDirExtended(dirp, path, elevel);
        if (entry == NULL)
            break;

        CHECK_FOR_INTERRUPTS();

        if (strcmp(entry->d_name, ".") == 0 ||
            strcmp(entry->d_name, "..") == 0)
            continue;

        snprintf(child, sizeof(child), "%s/%s", path, entry->d_name);

        /* stat() follows symlinks; lstat() reports the link itself. */
        rc = process_symlinks ? stat(child, &child_st)
            : lstat(child, &child_st);

        if (rc < 0)
        {
            ereport(elevel,
                    (errcode_for_file_access(),
                     errmsg("could not stat file \"%s\": %m", child)));
            continue;
        }

        /*
         * Regular files get the action directly.  Directories are recursed
         * into, always with process_symlinks = false.  Every other file type
         * is ignored.
         */
        if (S_ISREG(child_st.st_mode))
            (*action) (child, false, elevel);
        else if (S_ISDIR(child_st.st_mode))
            walkdir(child, action, false, elevel);
    }

    FreeDir(dirp);                /* we ignore any error here */

    /*
     * Finally apply the action to the directory itself.  This matters for
     * fsync: syncing the individual files does not guarantee that their
     * directory entries are synced too.
     */
    (*action) (path, true, elevel);
}


/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
    int            rawfd;

    /* Don't try to flush directories, it'll likely just fail */
    if (isdir)
        return;

    rawfd = OpenTransientFile((char *) fname, O_RDONLY | PG_BINARY, 0);
    if (rawfd < 0)
    {
        /* Permission failures are skipped silently; the rest is reported. */
        if (errno != EACCES)
            ereport(elevel,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\": %m", fname)));
        return;
    }

    /*
     * pg_flush_data() ignores errors, which is ok because this is only a
     * hint.
     */
    pg_flush_data(rawfd, 0, 0);

    (void) CloseTransientFile(rawfd);
}

#endif                            /* PG_FLUSH_DATA_WORKS */

/*
 * datadir_fsync_fname -- fsync a file or directory, skipping unreadable ones
 *
 * Forwards to fsync_fname_ext() with ignore_perm set, so that errors about
 * unreadable files are silently ignored.  The result of the sync attempt is
 * discarded; other failures are reported by fsync_fname_ext() at "elevel".
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
    (void) fsync_fname_ext(fname, isdir, true, elevel);
}

/*
 * fsync_fname_ext -- Try to fsync a file or directory
 *
 * fname is the path to sync and isdir tells whether it names a directory.
 * If ignore_perm is true, errors upon trying to open unreadable files are
 * ignored.  Other errors are logged at the caller-specified elevel.
 *
 * Failing to open a directory (EISDIR/EACCES) or to fsync one (EBADF/EINVAL)
 * is treated as "not supported here" rather than as an error.
 *
 * Returns 0 if the operation succeeded or was harmlessly skipped, -1
 * otherwise.
 */
static int
fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
{// #lizard forgives
    int            fd;
    int            flags;
    int            returncode;

    /*
     * Some OSs require directories to be opened read-only whereas other
     * systems don't allow us to fsync files opened read-only; so we need both
     * cases here.  Using O_RDWR will cause us to fail to fsync files that are
     * not writable by our userid, but we assume that's OK.
     */
    flags = PG_BINARY;
    if (!isdir)
        flags |= O_RDWR;
    else
        flags |= O_RDONLY;

    fd = OpenTransientFile((char *) fname, flags, 0);

    if (fd < 0)
    {
        /*
         * Some OSs don't allow us to open directories at all (Windows
         * returns EACCES), just ignore the error in that case.
         */
        if (isdir && (errno == EISDIR || errno == EACCES))
            return 0;

        /* If desired, also silently ignore errors about unreadable files. */
        if (ignore_perm && errno == EACCES)
            return 0;

        /* Anything else gets logged. */
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", fname)));
        return -1;
    }

    returncode = pg_fsync(fd);

    /*
     * Some OSes don't allow us to fsync directories at all, so we can ignore
     * those errors.  Depending on the platform and filesystem, that condition
     * is reported as either EBADF or EINVAL, so tolerate both.  Anything else
     * needs to be logged.
     */
    if (returncode != 0 &&
        !(isdir && (errno == EBADF || errno == EINVAL)))
    {
        int            save_errno;

        /* close file upon error, might not be in transaction context */
        save_errno = errno;
        (void) CloseTransientFile(fd);
        errno = save_errno;     /* keep fsync's errno for the %m below */

        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not fsync file \"%s\": %m", fname)));
        return -1;
    }

    (void) CloseTransientFile(fd);

    return 0;
}

/*
 * fsync_parent_path -- fsync the directory that contains "fname"
 *
 * Syncing the parent directory is what makes file operations on "fname"
 * durable across an OS crash or power failure.  Problems are reported at
 * "elevel".
 *
 * Returns 0 on success, -1 if syncing the parent directory failed.
 */
static int
fsync_parent_path(const char *fname, int elevel)
{
    char        parentpath[MAXPGPATH];
    int         rc;

    strlcpy(parentpath, fname, sizeof(parentpath));
    get_parent_directory(parentpath);

    /*
     * For a bare file name, get_parent_directory() leaves us with an empty
     * string (see comments in path.c); treat that as the current directory.
     */
    if (parentpath[0] == '\0')
        strlcpy(parentpath, ".", sizeof(parentpath));

    rc = fsync_fname_ext(parentpath, true, false, elevel);

    return (rc != 0) ? -1 : 0;
}
